import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from urllib.request import urlopen
import squarify
import folium
import json
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from pmdarima.arima import auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
Data = pd.read_csv('valeursfoncieres-2022.txt', sep='|')
Data
| Identifiant de document | Reference document | 1 Articles CGI | 2 Articles CGI | 3 Articles CGI | 4 Articles CGI | 5 Articles CGI | No disposition | Date mutation | Nature mutation | ... | Surface Carrez du 5eme lot | Nombre de lots | Code type local | Type local | Identifiant local | Surface reelle bati | Nombre pieces principales | Nature culture | Nature culture speciale | Surface terrain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 03/01/2022 | Vente | ... | NaN | 1 | 2.0 | Appartement | NaN | 24.0 | 1.0 | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 03/01/2022 | Vente | ... | NaN | 0 | NaN | NaN | NaN | NaN | NaN | S | NaN | 84.0 |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 03/01/2022 | Vente | ... | NaN | 0 | NaN | NaN | NaN | NaN | NaN | S | NaN | 88.0 |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 03/01/2022 | Vente | ... | NaN | 1 | 2.0 | Appartement | NaN | 140.0 | 3.0 | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 04/01/2022 | Vente | ... | NaN | 0 | NaN | NaN | NaN | NaN | NaN | T | NaN | 510.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3803880 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 30/12/2022 | Vente | ... | NaN | 1 | 4.0 | Local industriel. commercial ou assimilé | NaN | 327.0 | 0.0 | NaN | NaN | NaN |
| 3803881 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 19/12/2022 | Vente | ... | NaN | 2 | 3.0 | Dépendance | NaN | 0.0 | 0.0 | NaN | NaN | NaN |
| 3803882 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 19/12/2022 | Vente | ... | NaN | 2 | 2.0 | Appartement | NaN | 40.0 | 3.0 | NaN | NaN | NaN |
| 3803883 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 30/12/2022 | Vente | ... | NaN | 1 | 3.0 | Dépendance | NaN | 0.0 | 0.0 | NaN | NaN | NaN |
| 3803884 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 21/11/2022 | Vente | ... | NaN | 1 | 3.0 | Dépendance | NaN | 0.0 | 0.0 | NaN | NaN | NaN |
3803885 rows × 43 columns
columns_to_keep = ['Date mutation','Nature mutation','Valeur fonciere','Code postal','Commune','Code departement','Code commune','Nombre de lots','Code type local','Type local','Surface reelle bati','Nombre pieces principales','Surface terrain']
Data['Date mutation'] = pd.to_datetime(Data['Date mutation'])
Data['Code departement'] = Data['Code departement'].astype(str)
Data = Data[columns_to_keep]
Data = Data.dropna()
Data['Valeur fonciere'] = pd.to_numeric(Data['Valeur fonciere'].str.replace(',', '.'))
Data
| Date mutation | Nature mutation | Valeur fonciere | Code postal | Commune | Code departement | Code commune | Nombre de lots | Code type local | Type local | Surface reelle bati | Nombre pieces principales | Surface terrain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 2022-06-01 | Vente | 255000.0 | 1560.0 | MANTENAY-MONTLIN | 1 | 230 | 0 | 1.0 | Maison | 108.0 | 5.0 | 649.0 |
| 8 | 2022-03-01 | Vente | 525000.0 | 1390.0 | SAINT-ANDRE-DE-CORCY | 1 | 333 | 0 | 4.0 | Local industriel. commercial ou assimilé | 424.0 | 0.0 | 628.0 |
| 9 | 2022-03-01 | Vente | 525000.0 | 1390.0 | SAINT-ANDRE-DE-CORCY | 1 | 333 | 0 | 2.0 | Appartement | 126.0 | 4.0 | 628.0 |
| 13 | 2022-03-01 | Vente | 140000.0 | 1000.0 | BOURG-EN-BRESSE | 1 | 53 | 0 | 1.0 | Maison | 100.0 | 4.0 | 796.0 |
| 21 | 2022-10-01 | Vente | 580000.0 | 1500.0 | AMBRONAY | 1 | 7 | 0 | 3.0 | Dépendance | 0.0 | 0.0 | 496.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3803693 | 2022-12-28 | Vente | 12281000.0 | 75011.0 | PARIS 11 | 75 | 111 | 0 | 2.0 | Appartement | 78.0 | 3.0 | 265.0 |
| 3803694 | 2022-12-28 | Vente | 12281000.0 | 75011.0 | PARIS 11 | 75 | 111 | 0 | 2.0 | Appartement | 17.0 | 1.0 | 265.0 |
| 3803695 | 2022-12-28 | Vente | 12281000.0 | 75011.0 | PARIS 11 | 75 | 111 | 0 | 2.0 | Appartement | 49.0 | 3.0 | 265.0 |
| 3803705 | 2022-12-30 | Vente | 3100000.0 | 75012.0 | PARIS 12 | 75 | 112 | 0 | 4.0 | Local industriel. commercial ou assimilé | 709.0 | 0.0 | 258.0 |
| 3803723 | 2022-08-12 | Vente | 2600000.0 | 75014.0 | PARIS 14 | 75 | 114 | 0 | 1.0 | Maison | 180.0 | 8.0 | 163.0 |
1208299 rows × 13 columns
MUTATIONS = Data['Nature mutation'].unique()
def plotMutations(mut, data, ax):
for m in MUTATIONS:
temp = data[data['Nature mutation'] == m]
result = temp.groupby(temp['Date mutation'].dt.to_period("M"))['Valeur fonciere'].sum()
result.index = result.index.to_timestamp()
x = result.index
y = result.values
if m == mut:
ax.plot(x, y, color="#0b53c1", lw=2.4, zorder=10)
ax.scatter(x, y, fc="w", ec="#0b53c1", s=60, lw=2.4, zorder=12)
ax.autoscale()
else:
ax.plot(x, y, color="#BFBFBF", lw=1.5)
ax.set_title(mut, fontfamily="DejaVu Sans", fontsize=14, fontweight=500)
return ax
fig, axes = plt.subplots(2, 3, figsize=(14, 7.5))
for idx, (ax, mut) in enumerate(zip(axes.ravel(), MUTATIONS)):
# Only annotate the first panel
annotate = idx == 0
plotMutations(mut, Data, ax)
data1 = Data[Data['Nature mutation'] =='Vente']
data1 = data1.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
data2 = Data[Data['Nature mutation'] =='Vente terrain à bâtir']
data2 = data2.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
data3 = Data[Data['Nature mutation'] =='Echange']
data3 = data3.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
data4 = Data[Data['Nature mutation'] =="Vente en l'état futur d'achèvement"]
data4 = data4.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
data5 = Data[Data['Nature mutation'] =='Adjudication']
data5 = data5.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
data6 = Data[Data['Nature mutation'] =='Expropriation']
data6 = data6.groupby(by='Date mutation',sort='Date mutation')['Valeur fonciere'].count()
plt.figure(figsize=(18,11))
plt.plot(data1.index, data1.values, "r--", color="red")
plt.plot(data2.index, data2.values, "r--", color="blue")
plt.plot(data3.index, data3.values, "r--", color="green")
plt.plot(data4.index, data4.values, "r--", color="yellow")
plt.plot(data5.index, data5.values, "r--", color="purple")
plt.plot(data6.index, data6.values, "r--", color="black")
plt.legend(['Vente','Vente terrain à bâtir', 'Echange',"Vente en l'état futur d'achèvement",'Adjudication','Expropriation'])
plt.title('Nombre de mutations par type au cours des mois, en cumulé')
plt.show()
df = Data.groupby(['Type local'])['Type local'].count()
plt.bar(df.index, df.values)
bars = ['Appartement', 'Dépendance', 'Industriel', 'Maison']
y_pos = np.arange(len(bars))
plt.xticks(y_pos, bars)
plt.title('Nombre de mutations par type')
Text(0.5, 1.0, 'Nombre de mutations par type')
fig, ax = plt.subplots()
ax.pie(df, labels=df.index, autopct='%1.1f%%')
ax.set_title('Proportion des types de locaux sur le nombre total de mutations')
plt.show()
df=Data[Data["Surface terrain"]< 5000]
plt.figure(figsize=(18,10))
plt.xticks(rotation=25)
sbn.violinplot(x = "Type local",y="Surface terrain", data=df)
plt.title('Répartition des types de locaux selon la surface de leur terrain')
Text(0.5, 1.0, 'Répartition des types de locaux selon la surface de leur terrain')
df=Data[(Data["Surface reelle bati"]< 1000) & (Data["Surface reelle bati"].notna())]
plt.figure(figsize=(18,10))
plt.xticks(rotation=25)
sbn.violinplot(x = "Type local",y="Surface reelle bati", data=df)
plt.title('Répartition des types de locaux selon leur surface réelle bâtie')
Text(0.5, 1.0, 'Répartition des types de locaux selon leur surface réelle bâtie')
df = Data[Data['Valeur fonciere'] < 2000000]
plt.figure(figsize=(18,10))
plt.xticks(rotation=25)
sbn.violinplot(x="Type local",y="Valeur fonciere",data=df)
plt.title('Répartition des types de locaux selon leur valeur foncière')
Text(0.5, 1.0, 'Répartition des types de locaux selon leur valeur foncière')
df = Data.groupby(['Code departement'])['Nature mutation'].count().sort_values(ascending=True)
plt.figure(figsize=(10,20))
plt.hlines(y=df.index, xmin=0, xmax=df.values, color='black')
plt.plot(df.values, df.index, "o", color="blue")
plt.title('Nombre de mutations par département')
plt.xlabel('Nombre de mutations')
plt.ylabel('Numéros de départements')
plt.show()
df1 = Data.groupby(['Code departement'])['Nature mutation'].count()
df = Data[((Data['Nature mutation']=='Vente') & ((Data['Type local'] == 'Maison') | (Data['Type local'] == 'Appartement')))]
df = Data.groupby(['Code departement'])['Nature mutation'].count()
plt.figure(figsize=(10,20))
plt.hlines(y=df1.index, xmin = 0, xmax = df1.values, color='red')
plt.hlines(y=df.index, xmin=0, xmax=df.values, color='skyblue')
plt.plot(df.values, df.index, "o")
plt.plot(df1.values, df1.index, "x", color="white")
plt.title("Nombre de mutations par département (ronds) et nombre de mutations par département en considérant uniquement les ventes d'appartements et de maisons (croix)")
plt.xlabel('Nombre de mutations')
plt.ylabel('Numéros de départements')
plt.show()
df_filtered = Data.loc[(Data['Nature mutation'] == 'Vente') & (Data['Type local'].isin(['Maison', 'Appartement']))]
df_mutations = df_filtered.groupby(['Code departement'])['Nature mutation'].count().reset_index()
df_mutations = df_mutations.sort_values(by='Nature mutation', ascending=False)
df_filtered2 = df_filtered.groupby(['Code departement', 'Type local'])['Type local'].count().unstack()
df_filtered2 = df_filtered2.fillna(0)
df_filtered2['Total'] = df_filtered2.sum(axis=1)
df_filtered2['% Maison/Appartement'] = ((df_filtered2['Maison'] + df_filtered2['Appartement']) / df_filtered2['Total']) * 100
df_filtered2 = df_filtered2.sort_values(by='% Maison/Appartement', ascending=False)
fig, ax = plt.subplots(figsize=(18, 12))
sbn.barplot(x='% Maison/Appartement', y=df_filtered2.index, data=df_filtered2, color='skyblue')
ax.set_title("Pourcentage de ventes d'appartements et de maisons par département")
ax.set_xlabel('% Maison/Appartement')
ax.set_ylabel('Numéro de département')
plt.show()
myscale = None
def mapping_france_folium(data):
map = folium.Map(location=[48.862, 2.346], zoom_start = 5)
departments = f"https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
d = {'Code': data.index, 'Valeur': np.log(data.values)}
da = pd.DataFrame(d)
folium.Choropleth(geo_data=departments,
data=da,
columns=['Code', 'Valeur'],
key_on='properties.code',
fill_color= "PuRd",
fill_opacity=1,
line_opacity=.1).add_to(map)
folium.LayerControl().add_to(map)
return map
def mapping_Paris_circle(data, bigNumbers = False):
map = folium.Map(location = [48.856578, 2.351828], zoom_start = 12)
arr = json.load(open("arrondissements.geojson"))
d = {'Code': data.index, 'Valeur': data.values}
da = pd.DataFrame(d)
for a in arr["features"]:
prop = a["properties"]
temp = da[da['Code'] == prop["c_arinsee"] - 100]
temp = temp['Valeur'].values
folium.Circle(prop["geom_x_y"],
fill=True,
popup = prop["l_ar"],
radius = (temp[0]/1) if not bigNumbers else temp[0]/9000000).add_to(map)
return map
def mapping_Paris(data):
map = folium.Map(location = [48.856578, 2.351828], zoom_start = 12)
arr = json.load(open("arrondissements.geojson"))
d = {'Code': data.index + 100, 'Valeur': np.log(data.values)}
da = pd.DataFrame(d)
da = da[(da['Code'] >= 75100) & (da['Code'] <= 75120)]
myscale = np.linspace(da['Valeur'].min(), da['Valeur'].max(), 10)
folium.Choropleth(geo_data=arr,
data=da,
columns=['Code', 'Valeur'],
key_on='properties.c_arinsee',
fill_color= "PuRd",
threshold_scale=myscale,
fill_opacity=0.8,
line_opacity=.1).add_to(map)
folium.LayerControl().add_to(map)
return map
def mapping_Lyon(data):
map = folium.Map(location = [45.763420, 4.834277], zoom_start = 12)
arr = json.load(open("lyon.json"))
d = {'Code': data.index + 380, 'Valeur': np.log(data.values)}
da = pd.DataFrame(d)
da = da[(da['Code'] >= 69381) & (da['Code'] <= 69389)]
folium.Choropleth(geo_data=arr,
data=da,
columns=['Code', 'Valeur'],
key_on='properties.insee',
fill_color= "PuRd",
threshold_scale=myscale,
fill_opacity=0.8,
line_opacity=.1).add_to(map)
folium.LayerControl().add_to(map)
return map
def mapping_Marseille(data):
map = folium.Map(location = [43.296482, 5.36978], zoom_start = 12)
arr = json.load(open("marseille.geojson"))
d = {'Code': data.index + 200, 'Valeur': np.log(data.values)}
da = pd.DataFrame(d)
da = da[(da['Code'] >= 13201) & (da['Code'] <= 13216)]
da['Code'] = da['Code'].astype(int).astype(str)
folium.Choropleth(geo_data=arr,
data=da,
columns=['Code', 'Valeur'],
key_on='properties.DEPCO',
fill_color= "PuRd",
threshold_scale=myscale,
fill_opacity=0.8,
line_opacity=.1).add_to(map)
folium.LayerControl().add_to(map)
return map
data = Data.groupby(['Code departement'])['Nature mutation'].count()
map = mapping_france_folium(data)
map
data = Data[Data['Nature mutation'] == 'Vente'].groupby(['Code departement'])['Valeur fonciere'].sum()
map = mapping_france_folium(data)
map
data = Data[Data['Nature mutation'] == 'Vente']
data = data[data['Type local'] == 'Maison'].groupby(['Code departement'])['Valeur fonciere'].sum()
map = mapping_france_folium(data)
map
data = Data[(Data['Nature mutation'] == 'Vente') & ((Data['Type local'] == 'Maison') | (Data['Type local'] == 'Appartement'))]
data['prix_m2'] = data['Valeur fonciere']/data['Surface reelle bati']
data = data.groupby(['Code departement'])['prix_m2'].agg('mean')
map = mapping_france_folium(data)
map
data = Data[['Surface reelle bati','Valeur fonciere']]
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18,10))
# Premier graphique avec échelle logarithmique
ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.scatter(data['Surface reelle bati'],data['Valeur fonciere'])
ax1.set_title('Répartition de la valeur foncière en fonction de la surface bâtie (échelle logarithmique)')
ax1.set_xlabel('Surface bâtie')
ax1.set_ylabel('Valeur foncière')
# Deuxième graphique sans échelle logarithmique
ax2.scatter(data['Surface reelle bati'],data['Valeur fonciere'])
ax2.set_title('Répartition de la valeur foncière en fonction de la surface bâtie')
ax2.set_xlabel('Surface bâtie')
ax2.set_ylabel('Valeur foncière')
plt.show()
data = Data.groupby(['Code postal'])['Nature mutation'].count()
map = mapping_Paris_circle(data, False)
map
data = Data.groupby(['Code postal'])['Nature mutation'].count()
map = mapping_Paris(data)
map
data = Data[Data['Nature mutation'] == 'Vente']
data = data.groupby(['Code postal'])['Valeur fonciere'].sum()
map = mapping_Paris_circle(data, True)
map
data = Data[Data['Nature mutation'] == 'Vente']
data = data.groupby(['Code postal'])['Valeur fonciere'].sum()
map = mapping_Paris(data)
map
data = Data[Data['Nature mutation'] == 'Vente']
data = data.groupby(['Code postal'])['Valeur fonciere'].sum()
map = mapping_Marseille(data)
map
data = Data[Data['Nature mutation'] == 'Vente']
data = data.groupby(['Code postal'])['Valeur fonciere'].sum()
map = mapping_Lyon(data)
map
data_75016 = Data[Data['Code postal'] == 75016.0]
data_75016 = data_75016[['Date mutation', 'Valeur fonciere']]
data_75016['Date mutation'] = pd.to_datetime(data_75016['Date mutation'])
data_75016 = data_75016.set_index('Date mutation')
data_75016['Rendement'] = data_75016['Valeur fonciere'].pct_change()
data_75016 = data_75016.dropna()
#Série temporelle à fréquence hebdommadaire on cherche avec ARIMA à prédire une tendance
data_w = data_75016['Rendement'].resample('W').mean().dropna()
data_w = data_w.iloc[1:]
result = adfuller(data_75016['Rendement'])
print('p-value: %f' % result[1])
p-value: 0.000000
split_point = int(len(data_w) * 0.95)
train, test = data_w[0:split_point], data_w[split_point:]
model = auto_arima(train, start_p=0, start_q=0,
max_p=5, max_q=5,
m=12,
d=1,
seasonal=False,
start_P=0,
D=None,
trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
predictions = model.predict(n_periods=len(test))
plt.figure(figsize=(10,6))
plt.plot(test.index, predictions, color='red', label='Predictions')
plt.plot(test.index, test, color='blue', label='Actual values')
plt.title('Actual vs Predicted')
plt.xlabel('Date')
plt.ylabel('Rendement')
plt.legend()
plt.show()
Performing stepwise search to minimize aic ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=689.149, Time=0.03 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=691.144, Time=0.03 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=691.133, Time=0.13 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=688.062, Time=0.02 sec ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=inf, Time=0.36 sec Best model: ARIMA(0,1,0)(0,0,0)[0] Total fit time: 0.564 seconds
def autoarima(df, time_predict):
max_lags = df.shape[0] // 2 - 1
model = auto_arima(df, start_p=0, start_q=0,
max_p=5, max_q=5,
m=12,
d=1,
seasonal=False,
start_P=0,
D=None,
trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
prediction = model.predict(n_periods=time_predict)
prediction_list = np.array(prediction).tolist()
return prediction_list
time_predict = 3 # Prévoir pour les 3 premières semaines
predictions = autoarima(data_75016['Rendement'], time_predict)
print(f"Les prédictions sont : {predictions}")
Performing stepwise search to minimize aic ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=3919.882, Time=0.02 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=3850.476, Time=0.05 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=inf, Time=0.18 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=3917.884, Time=0.02 sec ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=3823.352, Time=0.07 sec ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=3809.481, Time=0.09 sec ARIMA(4,1,0)(0,0,0)[0] intercept : AIC=3801.459, Time=0.16 sec ARIMA(5,1,0)(0,0,0)[0] intercept : AIC=3796.578, Time=0.20 sec ARIMA(5,1,1)(0,0,0)[0] intercept : AIC=inf, Time=1.17 sec ARIMA(4,1,1)(0,0,0)[0] intercept : AIC=inf, Time=0.88 sec ARIMA(5,1,0)(0,0,0)[0] : AIC=3794.578, Time=0.11 sec ARIMA(4,1,0)(0,0,0)[0] : AIC=3799.459, Time=0.08 sec ARIMA(5,1,1)(0,0,0)[0] : AIC=inf, Time=0.92 sec ARIMA(4,1,1)(0,0,0)[0] : AIC=inf, Time=0.68 sec Best model: ARIMA(5,1,0)(0,0,0)[0] Total fit time: 4.665 seconds Les prédictions sont : [1.8138262819048268, 2.1115610081287386, 2.4544915903343405]
mse = mean_squared_error(test, predictions)
rmse = sqrt(mse)
median_target = np.median(test)
rmse_vs_median = rmse / median_target
print(f"RMSE : {rmse}")
print(f"Médiane de la variable cible : {median_target}")
print(f"RMSE en pourcentage de la médiane : {rmse_vs_median * 100}%")
RMSE : 2.3374249142461476 Médiane de la variable cible : 0.062157567249081704 RMSE en pourcentage de la médiane : 3760.483264860531%
data = Data[Data['Code departement'] == '75']
data = data[['Valeur fonciere','Surface reelle bati','Surface terrain', 'Nombre pieces principales']]
sbn.heatmap(data.corr(), annot= True, cmap='Reds')
plt.xticks(rotation = 45)
plt.title('Corrélation entre la valeur foncière, la surface réelle bâtie, \nla surface du terrain et le nombre de pièces principales\n(en IDF))')
plt.show()